Dynamic Topic Modelling of r/politics subreddit

This project allows:

  1. To gather data from Reddit and save it in csv format
  2. To clean gathered data and explore it
  3. To extract the main topics from gathered data
  4. To visualise dynamic changes of topics over time

To extract topics, we use BERTopic library, which performs topic modeling using clustering of vector representations of documents. The main differences between BERTopic and other topic models:

  1. High speed due to reducing the dimensionality of vector representations.
  2. Modular structure of the model pipeline: the stages of vectorization, dimensionality reduction and clustering are separated from each other, which allows you to easily and quickly experiment with different combinations of algorithm settings.
  3. The model pipeline consists of SOTA tools: SBERT, UMAP, HDBSCAN. Combined, this allows you to get the best results compared to other models.

This project can be easily adjusted to other sources of information, which allows you to conduct different experiments.

Install libraries

import os
from datetime import datetime
import time
from tqdm import tqdm
import pandas as pd
import spacy
import re

from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired

from sentence_transformers import SentenceTransformer

# from umap import UMAP
from cuml import UMAP

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# from hdbscan import HDBSCAN
from cuml.cluster.hdbscan import HDBSCAN

import plotly.io as pio

pio.renderers.default = "notebook+vscode+jupyterlab"

sns.set_theme(style="darkgrid")
%config InlineBackend.figure_format = "retina"

# Dictionaries:
# en_core_web_sm
# en_core_web_md
# en_core_web_lg
# en_core_web_trf

# Load a lightweight English spaCy pipeline.  All pipeline components are
# excluded because the notebook only needs spaCy's stop-word list below —
# the `nlp` object itself is never applied to any text here.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
)

# English stop words as a plain list (CountVectorizer below expects a list).
spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)

Load and clean data from csvs

BERTopic uses Transformers. The model learns better if it receives more information from the text. Therefore, preprocessing is minimal.

Function to clean data from HTML elements using regular expressions

def regex_preprocessing(text):
    """Strip URLs, markup leftovers and stray symbols from one raw comment.

    Preprocessing is deliberately light: the transformer-based embedding
    model benefits from mostly-intact text, so only noise is removed.
    Returns the cleaned string with collapsed, stripped whitespace.
    """
    # (pattern, replacement) pairs — applied strictly in this order.
    substitutions = (
        # Remove bare URLs
        (r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " "),
        # Remove URLs wrapped in parentheses (markdown-style links)
        (r"\(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\)", " "),
        # Remove special symbols: line breaks, tag-like spans, reddit
        # artefacts ("u/", emotes, "[gif]", "/s") and underscores
        (r"\n|\r|\<.*?\>|\{.*?\}|u/|\(.*emote.*\)|\[gif\]|/s|_", " "),
        # Replace anything outside the allowed character set
        (r"[^\w0-9'’“”%!?.,-:*()><]", " "),
        # Remove unnecessary brackets left behind by the URL removal
        (r"\s\(\s", " "),
        # Collapse runs of whitespace
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

Function to load a csv file into a dataframe, drop duplicate rows and apply the ‘regex_preprocessing’ function to the data

def data_preprocessing(file_name):
    """Load one csv of scraped comments, drop duplicates and clean the text.

    Parameters
    ----------
    file_name : str
        Path to a csv file with (at least) a "comments" column.

    Returns
    -------
    pandas.DataFrame
        The file's rows with all fully-duplicated rows removed and the
        "comments" column cleaned by ``regex_preprocessing``.
    """
    data = pd.read_csv(file_name)
    # keep=False drops *every* member of a duplicated group, not just the
    # extra copies.  .copy() makes the result an independent frame so the
    # column assignment below is not chained assignment on a view
    # (avoids pandas' SettingWithCopyWarning / silently lost writes).
    data_cleaned = data.drop_duplicates(keep=False).copy()
    data_cleaned["comments"] = data_cleaned["comments"].apply(regex_preprocessing)

    return data_cleaned

Function to create a dataframe with cleaned data

This function consists of several steps: 1. Firstly, it gets the names of the csv files in a chosen folder 2. Secondly, it applies the ‘data_preprocessing’ function to each csv file to create dataframes with cleaned data 3. Lastly, it combines them into a single dataframe with cleaned data

def process_data(directory):
    """Build one cleaned dataframe from every csv file in *directory*.

    Steps: list the files in sorted (deterministic) order, clean each file
    with ``data_preprocessing``, then concatenate and deduplicate the
    combined result.
    """
    # Full paths of all entries in the folder, sorted for reproducibility.
    paths = sorted(os.path.join(directory, entry) for entry in os.listdir(directory))

    # Clean every file individually before merging.
    frames = [data_preprocessing(path) for path in paths]

    combined = pd.concat(frames)
    combined = combined.drop(columns="time", axis=1)
    combined = combined.reset_index(drop=True)
    combined = combined.drop_duplicates()
    cleaned_df = combined.dropna()

    return cleaned_df

Apply data processing functions to gathered data

For this experiment, we load csv files with data marked as ‘hot’ by Reddit algorithms.

# Folder with the raw csv files scraped from r/politics ("hot" listing).
directory = "original_data/hot"
combined_df = process_data(directory)
# Number of comments left after cleaning and deduplication.
len(combined_df["comments"].to_list())
264230

Convert the dataframe columns to lists for further work

# Plain Python lists: BERTopic's fit_transform / topics_over_time take a
# list of documents and a parallel list of timestamps.
comments = combined_df["comments"].to_list()
timestamps = combined_df["date"].to_list()

Create embeddings from cleaned data

The gte-small model was chosen using the Hugging Face benchmark. It is lightweight and works well with Reddit data.

# Pre-calculate embeddings once, so BERTopic does not have to re-encode the
# comments every time the model is re-fitted with different hyperparameters.
embedding_model = SentenceTransformer(
    model_name_or_path="thenlper/gte-small",
    cache_folder="transformers_cache",
)
embeddings = embedding_model.encode(comments, show_progress_bar=True)

Plot data distribution

We use umap to reduce the dimensionality of the data, which makes it easier to cluster the data using HDBSCAN.

def plot_umap(embeddings, values):
    """Plot a 2x5 grid of 2-D UMAP projections, one per n_neighbors value.

    Lets you eyeball how the data's structure shifts from local (small
    n_neighbors) to global (large n_neighbors).
    """
    fig, axes = plt.subplots(2, 5, figsize=(27, 10), sharex=True, sharey=True)

    for ax, n_neighbors in tqdm(zip(axes.flatten(), values)):
        reducer = UMAP(
            n_neighbors=n_neighbors, n_components=2, min_dist=0.0, metric="cosine"
        )
        # Project the embeddings down to 2-D for plotting.
        projected = reducer.fit_transform(embeddings)
        ax.scatter(projected[:, 0], projected[:, 1], alpha=0.3, c="orangered", s=0.2)
        ax.set_title(f"UMAP, n_neighbors = {n_neighbors}")
        ax.set_xlabel("Компонента 1")
        ax.set_ylabel("Компонента 2")

    # Shared axes: one limit call constrains every subplot.
    lim = 7
    plt.ylim(-lim, lim)
    plt.xlim(-lim, lim)
    plt.tight_layout()
    plt.show()

def plot_hdbscan(embeddings, umap_values, hdbscan_values):
    """For each n_neighbors value, project once with UMAP and show how
    HDBSCAN clusters the 2-D projection under each min_cluster_size.

    One figure (a row of 4 subplots) is produced per n_neighbors value.
    """
    for n in umap_values:
        # One projection per n_neighbors, reused across all cluster sizes.
        reducer = UMAP(n_neighbors=n, n_components=2, min_dist=0.0, metric="cosine")
        projection = reducer.fit_transform(embeddings)

        fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharex=True, sharey=True)

        for ax, size in tqdm(zip(axes.flatten(), hdbscan_values)):
            # Cluster the reduced data with HDBSCAN.
            clusterer = HDBSCAN(
                min_cluster_size=size, metric="euclidean", prediction_data=True
            )
            labels = clusterer.fit_predict(projection)

            # Frame with the 2-D coordinates and their cluster labels.
            plot_df = pd.DataFrame(projection, columns=["UMAP1", "UMAP2"])
            plot_df["Cluster"] = labels

            sns.scatterplot(
                x="UMAP1",
                y="UMAP2",
                hue="Cluster",
                data=plot_df,
                palette="tab10",
                legend=None,
                linewidth=0,
                s=1,
                ax=ax,
            ).set_title(f"n_neighbors={n}, min_cluster_size={size}")
            ax.set_xlabel("Компонента 1")
            ax.set_ylabel("Компонента 2")

        # Shared axes: one limit call constrains every subplot.
        lim = 7
        plt.ylim(-lim, lim)
        plt.xlim(-lim, lim)
        plt.tight_layout()
        plt.show()

We plot a range of values to see how a structure of data changes: from a more local structure to a global one.

plot_umap(embeddings, np.arange(10, 56, 5))
10it [01:02,  6.29s/it]

We can see sizes of created clusters with different parameter combinations.

plot_hdbscan(embeddings, [10, 15, 25], [15, 35, 50, 75])
4it [01:57, 29.40s/it]

4it [02:06, 31.69s/it]

4it [01:59, 29.95s/it]

Extract topics using BERTopic

This part is purely experimental and requires a lot of time to tune the hyperparameters of the models to get the best output results.

Current hyperparameters are chosen taking into account the goals:

  1. Preserve the local structure of the data after reducing the dimensionality of the data with UMAP
  2. Reduce the amount of noise in clusters and create an adequate number of topics with HDBSCAN
  3. Create a list of understandable topics at the output

In this experiment, we use MaximalMarginalRelevance topic representation model, which changes the order of words in topics to remove semantic repetitions and create a sequence of the most significant words.

We use CountVectorizer from scikit-learn to: 1. remove very rare and frequent words from the final topic representations 2. create n-grams, up to 2 words in total 3. remove stopwords using spaCy stopwords list

Start Topic Modelling Pipeline

# UMAP init — 5 output dimensions for clustering; n_neighbors chosen from the
# plotting experiments above to keep a reasonably local structure.
umap_model = UMAP(n_neighbors=25, n_components=5, min_dist=0.0, metric="cosine")

# HDBSCAN init — min_cluster_size=50 chosen above to limit noise and keep the
# number of topics manageable.
hdbscan_model = HDBSCAN(min_cluster_size=50, metric="euclidean", prediction_data=True)

# Remove noise from created topics: drop very rare (min_df) and very frequent
# (max_df) terms, allow bigrams, and filter spaCy's English stop words.
vectorizer_model = CountVectorizer(
    stop_words=spacy_stopwords, min_df=0.03, max_df=0.99, ngram_range=(1, 2)
)

# BERTopic model init — MaximalMarginalRelevance re-ranks each topic's words
# to reduce semantic repetition in the final representations.
representation_model = MaximalMarginalRelevance()
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    verbose=True,
)

# Fit the model — passing the pre-computed embeddings skips re-encoding.
topics, probs = topic_model.fit_transform(comments, embeddings)
2024-12-06 00:54:49,925 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-06 00:55:06,919 - BERTopic - Dimensionality - Completed ✓
2024-12-06 00:55:06,922 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [00:54:56.293848] Transform can only be run with brute force. Using brute force.
2024-12-06 00:55:58,181 - BERTopic - Cluster - Completed ✓
2024-12-06 00:55:58,212 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-06 00:56:14,733 - BERTopic - Representation - Completed ✓

Get topics

# Get topics — one row per topic, including the -1 outlier/noise topic.
topic_representation = topic_model.get_topic_info()
# "Name" values look like "<topic id>_<top words joined by underscores>".
topic_representation["Name"].to_list()
['-1_trump_don_vote_election',
 '0_harris_vote harris_voted harris_harris wins',
 '1_trump_trump supporters_voted trump_donald trump',
 '2_campaign_didn_candidate_vote',
 '3_russia_putin_ukraine_nato',
 '4_orange_face_looks like_tells like',
 '5_israel_gaza_palestine_palestinians',
 '6_biden_garbage_joe biden_blame biden',
 '7_rfk_vaccines_vaccine_rfk jr',
 '8_christian_christians_religion_religious',
 '9_women_husbands_wives_women voted',
 '10_tariffs_inflation_economy_prices',
 '11_kamala wins_voted kamala_vote kamala_kamala campaign',
 '12_media_fox news_journalism_propaganda',
 '13_democrats_dems_democratic party_republicans',
 '14_abortion_roe_abortions_pregnancy',
 '15_trans_trans people_gender_transgender',
 '16_vote_popular vote_election_won popular',
 '17_musk_elon musk_tesla_spacex',
 '18_america_american_americans_usa',
 '19_thank_thanks_correct_ha ha',
 '20_rapist_rape_pedophile_raped',
 '21_vance_jd vance_peter thiel_25th',
 '22_deported_deport_deportation_immigration',
 '23_rogan_joe_joe rogan_podcast',
 '24_court_supreme court_scotus_justices',
 '25_bernie_dnc_bernie sanders_like bernie',
 '26_hitler_nazi_nazis_trump hitler',
 '27_voting_didn vote_people vote_don vote',
 '28_gaetz_matt_matt gaetz_mccarthy',
 '29_fascist_fascism_fascists_mussolini',
 '30_project_project 2025_2025_heritage foundation',
 '31_elon_trump elon_like elon_elon trump',
 '32_iowa_selzer_poll_selzer poll',
 '33_democracy_republic_democracies_american democracy',
 '34_boomers_generation_millennials_millennial',
 '35_maga_maga cult_maga people_maga don',
 '36_fraud_voter fraud_election fraud_steal election',
 '37_education_schools_department education_education system',
 '38_states_blue states_california_red states',
 '39_republicans_republican_republican party_republicans don',
 '40_epstein_jeffrey epstein_trump epstein_tapes',
 '41_aca_obamacare_repeal_affordable care',
 '42_prison_jail_sentencing_sentenced',
 '43_dumb_idiots_fucking stupid_stupidity',
 '44_gun_guns_firearms_second amendment',
 '45_eggs_egg_price eggs_cheaper',
 '46_military_generals_officers_officer',
 '47_recess_senate_mcconnell_senators',
 '48_ballots_voting_early voting_election day',
 '49_puerto_rico_puerto rico_ricans',
 '50_twitter_deleted_social media_like twitter',
 '51_polls_poll_polls vote_polling',
 '52_leopards_leopard_eat face_leopards eating',
 '53_conservatives_conservative_liberals_liberal',
 '54_texas_cruz_ted cruz_texan',
 '55_echo chamber_politics_echo chambers_reddit echo',
 '56_garland_merrick_merrick garland_doj',
 '57_racism_white people_white man_white men',
 '58_kids_dad_parent_family',
 '59_reagan_ronald reagan_nixon_trickle economics',
 '60_headline_read_click bait_didn read',
 '61_betting_markets_odds_bets',
 '62_walz_tim walz_tim_governor',
 '63_stein_green_jill stein_green party',
 '64_canada_canadian_immigration_conservative party',
 '65_cheney_liz_liz cheney_dick cheney',
 '66_war_firing_firing squad_war hawk',
 '67_wanted_happen_lol won_want',
 '68_newsom_shapiro_gavin_whitmer',
 '69_2016_like 2016_2016 2016_2020 2016',
 '70_shocked_surprise_surprised_shocked shocked',
 '71_latinos_latino_hispanic_hispanics',
 '72_stock_djt_shares_dump',
 '73_tucker_tucker carlson_gay_tim walz',
 '74_georgia_nc_counties_turnout',
 '75_law_laws_enforced_rule law',
 '76_fbi_clearance_security clearance_background checks',
 '77_cooked_dogs_pizza_cats',
 '78_joke_funny_laugh_laughed',
 '79_truck_garbage truck_cab_trucks',
 '80_states rights_state rights_states right_state governments',
 '81_dad_parents_family_voted',
 '82_billionaires_billionaire_millionaires_wealth',
 '83_porn_pornography_ban_banning',
 '84_veterans_va_vets_veteran',
 '85_tax_taxes_pay taxes_irs',
 '86_win_lose_losing_wins',
 '87_climate_climate change_warming_global warming',
 '88_pardon_pardons_trump pardon_pardoned',
 '89_tulsi_gabbard_tulsi gabbard_russian asset',
 '90_cross_tattoos_nazi_white supremacist',
 '91_worm_worms_brain worm_brain worms',
 '92_gay_gay guy_gay people_gays',
 '93_signs_trump signs_harris signs_sign',
 '94_fluoride_water_dental_teeth',
 '95_idiocracy_smartest_smart_idiots',
 '96_mirage_red mirage_wave_blue wave',
 '97_years years_years going_wait years_happen years',
 '98_waiting_going happen_days_wait',
 '99_celebrities_celebrity_endorsements_celebrity endorsements',
 '100_garbage_trash_trash bags_garbage bags',
 '101_economist_publication_conservative_economists',
 '102_housing_homes_25k_houses',
 '103_newt_gingrich_cheated_lie',
 '104_democrats_party_democratic_dems',
 '105_sad_cried_cry_crying',
 '106_wins_winning_win think_believe won',
 '107_elections_election_fair election_election years',
 '108_police_cops_cop_officers',
 '109_dementia_symptoms_cognitive_signs',
 '110_civil war_war_confederate_confederacy',
 '111_miller_stephen miller_goebbels_nazi',
 '112_bolton_john bolton_war_iraq',
 '113_terrorists_terrorism_domestic terrorists_domestic terrorist',
 '114_dictator_dictator day_dictators_dictatorship',
 '115_robinson_mark robinson_nc_race',
 '116_microphone_mic_microphones_blew',
 '117_clown_circus_clowns_clown car',
 '118_pence_mike pence_coward_january',
 '119_magats_magat_drew_cultists',
 '120_oligarchy_oligarchs_oligarch_billionaires',
 '121_national guard_guard_guards_governors',
 '122_fraud_scam_claims_scamming',
 '123_tiktok_tik tok_tik_videos',
 '124_bezos_amazon_wapo_jeff bezos',
 '125_vote blue_voted blue_voting blue_voted',
 '126_fema_hurricane_hurricanes_weather',
 '127_lottery_random_fraud_giveaway',
 '128_percent_percentages_estimate_10',
 '129_holiday_holidays_election day_voting',
 '130_young_young voters_young people_voting age',
 '131_bots_bot_trolls_thread',
 '132_felon_felons_convicted felon_felony',
 '133_combat_infantry_women_military',
 '134_jordan_jim jordan_jim_sex crimes',
 '135_reddit_posts_subreddits_mods',
 '136_ai_data_trained_human',
 '137_woke_wokeness_identity_woke agenda',
 '138_leopardsatemyface_sub going_subreddit_voted red',
 '139_wife_affair_married_divorced',
 '140_filibuster_senate_removed_judges',
 '141_billion_billions_billionaire_wealth',
 '142_arsonist_fire_burning_burned',
 '143_walker_herschel_herschel walker_missile',
 '144_taliban_afghanistan_women_hate women',
 '145_citizenship_spain_eu_netherlands',
 '146_libs_owning libs_owning_owned',
 '147_prince_order_titles_royal',
 '148_lawsuits_lawsuit_sue_lawyers',
 '149_social security_ss_medicare_retirement',
 '150_black people_black voters_black men_white people',
 '151_fucked_worse_fucked going_bleak',
 '152_pelosi_nancy pelosi_insider trading_speaker',
 '153_golf_golfing_golf course_play golf',
 '154_leak_leaked_leaks_leaking',
 '155_working class_unions_class working_class people',
 '156_palpatine_star wars_darth_empire',
 '157_late_little late_bit late_better late',
 '158_lie_lies_lies lies_lying',
 '159_susan_collins_shocked_learned lesson',
 '160_graham_lindsay_lindsay graham_spineless',
 '161_jail_trump jail_criminal_charges trump',
 '162_concept plan_concepts plan_sure plan_think plan',
 '163_elmo_twitter_bought_half things',
 '164_swamp_drain swamp_drain_draining',
 '165_oliver_john oliver_freaks_american citizen',
 '166_efficiency_new department_government efficiency_department government',
 '167_dc_statehood_maryland_islands',
 '168_empathy_want hurt_sympathy people_lack empathy',
 '169_mooch_mooches_metric_imperial',
 '170_alex jones_alex_jones_press secretary',
 '171_debts_secret service_pays_bills',
 '172_newsweek_trash_pro harris_posts',
 '173_corruption_corrupt_bribe_bribery',
 '174_treason_traitors_traitor_jail',
 '175_brexit_uk_referendum_brits',
 '176_citizens united_citizens_ruling_corporations',
 '177_mr trump_mr_trump_libertarian',
 '178_incels_incel_won listen_society',
 '179_onion_headline_infowars_satire',
 '180_smith_jack smith_deported_love know',
 '181_fuck em_fuck good_fuck fuck_fuckin',
 '182_putin_elon trump_trump elon_putin trump',
 '183_bannon_steve bannon_prison_look foolish',
 '184_hogan_hulk_hulk hogan_kid rock',
 '185_rally_rallies_said rally_cheers',
 '186_gerrymandering_gerrymandered_districts_statewide',
 '187_loan_loans_student loan_student loans',
 '188_party_parties_party system_party maybe',
 '189_tester_montana_jon_senate',
 '190_basketball_cleveland_james_miami',
 '191_couch_couches_couch fucker_fucker',
 '192_leon_wtf_cult don_knows',
 '193_smell_poop_poo_smells',
 '194_anthony_likable_bad trump_bail',
 '195_cheating_cheated_cheat_time don',
 '196_electoral college_electoral_popular vote_electoral votes',
 '197_aged_age_poorly_aging']

Dynamic Topic Modelling

# Get topics over time — documents are grouped into nr_bins=20 time slices
# using the "date" timestamps parsed with the given format.  global_tuning
# and evolution_tuning smooth each slice's topic representation (see the
# BERTopic topics_over_time documentation for the exact averaging scheme).
topics_over_time = topic_model.topics_over_time(
    comments,
    timestamps,
    datetime_format="%Y_%m_%d",
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)
16it [02:08,  8.06s/it]

Topics over Time Plot

# The Plotly default renderer was already configured at the top of the
# notebook, so the duplicate `import plotly.io as pio` and renderer
# re-assignment were removed here.
# Plot the frequency of the 10 largest topics across the time bins.
topic_model.visualize_topics_over_time(
    topics_over_time, top_n_topics=10, height=700, width=1200
)

Normalized Topics over Time Plot

# Same plot as above, but with each topic's frequency normalized, which makes
# topics of different overall sizes easier to compare across time bins.
topic_model.visualize_topics_over_time(
    topics_over_time, top_n_topics=10, height=700, width=1200, normalize_frequency=True
)